This file aims to reproduce the findings of Buckler et al. (2009), "The Genetic Architecture of Maize Flowering Time".
It uses data from Panzea.
use_gpu_num = 1  # index of the CUDA device to pin, if CUDA is available

import os
import pandas as pd
import numpy as np
import re
import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torch import nn

# Select the compute device. Only pin a specific GPU when CUDA is actually
# available — the original called torch.cuda.set_device unconditionally,
# which raises a RuntimeError on CPU-only hosts (the old "# TODO fixme").
device = "cuda" if torch.cuda.is_available() else "cpu"
if device == "cuda" and use_gpu_num in [0, 1]:
    torch.cuda.set_device(use_gpu_num)
print(f"Using {device} device")

import tqdm
import plotly.graph_objects as go
import plotly.express as px
# [e for e in os.listdir() if re.match(".+\\.txt", e)]
/home/labmember/mambaforge/envs/pytorch_mamba/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html from .autonotebook import tqdm as notebook_tqdm
Using cuda device
# Flowering-time BLUPs (days to anthesis, days to silk, ASI) summarized
# across 8 environments for the NAM population (Buckler et al. 2009 data).
nam_overview = pd.read_table('../ext_data/zma/panzea/phenotypes/Buckler_etal_2009_Science_flowering_time_data-090807/NAMSum0607FloweringTraitBLUPsAcross8Envs.txt')
nam_overview
| Geno_Code | Entry_ID | Group | pop | entry | Days_To_Anthesis_BLUP_Sum0607 | Days_To_Silk_BLUP_Sum0607 | ASI_BLUP_Sum0607 | |
|---|---|---|---|---|---|---|---|---|
| 0 | Z001E0001 | 04P1367A51A | Z001 | 1 | 1 | 75.5364 | 77.1298 | 1.4600 |
| 1 | Z001E0002 | 04P1368A51A | Z001 | 1 | 2 | 76.9075 | 77.7945 | 1.3928 |
| 2 | Z001E0003 | 04P1368B51A | Z001 | 1 | 3 | 75.2646 | 75.2555 | 0.8644 |
| 3 | Z001E0004 | 04P1370B51A | Z001 | 1 | 4 | 73.6933 | 75.7604 | 2.0012 |
| 4 | Z001E0005 | 04P1371B51A | Z001 | 1 | 5 | 79.2441 | 81.2611 | 1.8931 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 5458 | Z027E0277 | W64A | NaN | 27 | 277 | 71.9008 | 73.9811 | 2.6756 |
| 5459 | Z027E0278 | WD | NaN | 27 | 278 | 62.0212 | 60.5992 | -0.5733 |
| 5460 | Z027E0279 | Wf9 | NaN | 27 | 279 | 71.9970 | 72.2319 | 0.8338 |
| 5461 | Z027E0280 | Yu796_NS | NaN | 27 | 280 | 74.5107 | 73.9727 | 0.2935 |
| 5462 | Z027E0282 | Mo17 | NaN | 27 | 282 | 72.7428 | 75.5080 | 3.0455 |
5463 rows × 8 columns
# Marker genotypes joined with the three phenotypes; skiprows=1 skips the
# extra header line in the file. The genotype codes are read in as the
# index, so reset_index/rename recovers them as a regular 'Geno_Code' column.
data = pd.read_table('../ext_data/zma/panzea/phenotypes/Buckler_etal_2009_Science_flowering_time_data-090807/markergenotypes062508.txt', skiprows=1
).reset_index().rename(columns = {'index': 'Geno_Code'})
data
| Geno_Code | days2anthesis | days2silk | asi | pop | i0 | i1 | i2 | i3 | i4 | ... | i1096 | i1097 | i1098 | i1099 | i1100 | i1101 | i1102 | i1103 | i1104 | i1105 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Z001E0001 | 75.5364 | 77.1298 | 1.4600 | 1 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 2.0 | 2.0 | 2.0 | 2.0 | 2.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 1 | Z001E0002 | 76.9075 | 77.7945 | 1.3928 | 1 | 2.0 | 2.0 | 2.0 | 2.0 | 2.0 | ... | 2.0 | 2.0 | 2.0 | 2.0 | 2.0 | 2.0 | 2.0 | 2.0 | 2.0 | 2.0 |
| 2 | Z001E0003 | 75.2646 | 75.2555 | 0.8644 | 1 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 3 | Z001E0004 | 73.6933 | 75.7604 | 2.0012 | 1 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 2.0 | 2.0 | 2.0 | 2.0 |
| 4 | Z001E0005 | 79.2441 | 81.2611 | 1.8931 | 1 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 2.0 | 2.0 | 2.0 | 2.0 | 2.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 4694 | Z026E0196 | 77.6523 | 80.2916 | 1.9698 | 26 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 2.0 | 2.0 | 2.0 | 2.0 | 2.0 | 2.0 | 1.0 | 1.0 | 1.0 | 1.0 |
| 4695 | Z026E0197 | 78.5015 | 82.2767 | 3.2979 | 26 | 2.0 | 2.0 | 2.0 | 2.0 | 2.0 | ... | 0.5 | 0.5 | 0.5 | 0.5 | 0.5 | 0.5 | 1.0 | 1.0 | 1.0 | 1.0 |
| 4696 | Z026E0198 | 77.4219 | 79.7868 | 2.2208 | 26 | 1.0 | 1.0 | 1.0 | 1.0 | 2.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 4697 | Z026E0199 | 78.6712 | 82.8476 | 4.1247 | 26 | 2.0 | 2.0 | 0.0 | 0.0 | 0.0 | ... | 2.0 | 2.0 | 2.0 | 2.0 | 2.0 | 2.0 | 2.0 | 2.0 | 2.0 | 2.0 |
| 4698 | Z026E0200 | 77.4937 | 82.4678 | 4.2915 | 26 | 0.0 | 0.0 | 0.0 | 2.0 | 2.0 | ... | 2.0 | 2.0 | 2.0 | 2.0 | 2.0 | 2.0 | 2.0 | 2.0 | 2.0 | 2.0 |
4699 rows × 1111 columns
# Pairwise scatter matrix of the three phenotypes to eyeball correlations.
px.scatter_matrix(data.loc[:, ['days2anthesis', 'days2silk', 'asi']])
/home/labmember/mambaforge/envs/pytorch_mamba/lib/python3.10/site-packages/plotly/express/_core.py:279: FutureWarning: iteritems is deprecated and will be removed in a future version. Use .items instead. dims = [
# Pull the phenotype vectors and the marker matrix out of the dataframe.
d2a = np.array(data['days2anthesis'])  # days to anthesis
d2s = np.array(data['days2silk'])      # days to silk
asi = np.array(data['asi'])            # anthesis-silking interval
xs = np.array(data.drop(columns = ['days2anthesis', 'days2silk', 'asi', 'pop', 'Geno_Code']))
n_obs = xs.shape[0]

# Reproducible 80/20 train/test split by shuffled row index.
np_seed = 9070707
rng = np.random.default_rng(np_seed) # can be called without a seed
test_pr = 0.2
test_n = round(n_obs*test_pr)
# arange replaces the old linspace(...).astype(int), which relied on
# float rounding to produce integer indices.
idxs = np.arange(n_obs)
rng.shuffle(idxs)
test_idxs = idxs[0:test_n]
# BUG FIX: was idxs[test_n:-1], which silently dropped the last shuffled
# observation from the training set.
train_idxs = idxs[test_n:]
# make up tensors
def calc_cs(x):
    """Return column-wise centre/scale statistics of *x*: [mean, std] along axis 0."""
    mean = np.mean(x, axis=0)
    std = np.std(x, axis=0)
    return [mean, std]
def apply_cs(xs, cs_dict_entry):
    """Centre and scale *xs* given a [mean, std] pair from calc_cs.

    BUG FIX: the original divided by cs_dict_entry[0] (the mean) instead of
    cs_dict_entry[1] (the std), so the std computed by calc_cs was never
    used and the values were never actually standardized.
    """
    return (xs - cs_dict_entry[0]) / cs_dict_entry[1]
# Centre/scale statistics are computed on the TRAINING split only, so the
# held-out test data never leaks into the normalization parameters.
scale_dict = {
    'd2a':calc_cs(d2a[train_idxs]),
    'd2s':calc_cs(d2s[train_idxs]),
    'asi':calc_cs(asi[train_idxs]),
    'xs' :calc_cs(xs[train_idxs])
}
# Standardize the three phenotypes with the training-set statistics.
y1 = apply_cs(d2a, scale_dict['d2a'])
y2 = apply_cs(d2s, scale_dict['d2s'])
y3 = apply_cs(asi, scale_dict['asi'])
# No need to cs xs -- 0-2 scale
# apply_cs(xs, scale_dict['xs'])
# Move everything onto the chosen device as float32 tensors; the [:, None]
# adds a trailing dimension so each target is an (n, 1) column vector.
y1_train = torch.from_numpy(y1[train_idxs]).to(device).float()[:, None]
y2_train = torch.from_numpy(y2[train_idxs]).to(device).float()[:, None]
y3_train = torch.from_numpy(y3[train_idxs]).to(device).float()[:, None]
xs_train = torch.from_numpy(xs[train_idxs]).to(device).float()
y1_test = torch.from_numpy(y1[test_idxs]).to(device).float()[:, None]
y2_test = torch.from_numpy(y2[test_idxs]).to(device).float()[:, None]
y3_test = torch.from_numpy(y3[test_idxs]).to(device).float()[:, None]
xs_test = torch.from_numpy(xs[test_idxs]).to(device).float()
class CustomDataset(Dataset):
    """Dataset yielding (xs, y1, y2, y3) tuples for multi-trait regression.

    transform is applied to the marker features; target_transform is applied
    to each of the three phenotype targets.
    """
    def __init__(self, y1, y2, y3, xs, transform = None, target_transform = None):
        self.y1 = y1
        self.y2 = y2
        self.y3 = y3
        self.xs = xs
        self.transform = transform
        self.target_transform = target_transform

    def __len__(self):
        return len(self.y1)

    def __getitem__(self, idx):
        y1_idx = self.y1[idx]
        y2_idx = self.y2[idx]
        y3_idx = self.y3[idx]
        xs_idx = self.xs[idx]
        if self.transform:
            xs_idx = self.transform(xs_idx)
        if self.target_transform:
            # BUG FIX: the original applied self.transform (the feature
            # transform) to the targets here instead of target_transform.
            y1_idx = self.target_transform(y1_idx)
            y2_idx = self.target_transform(y2_idx)
            y3_idx = self.target_transform(y3_idx)
        return xs_idx, y1_idx, y2_idx, y3_idx
# Mini-batch loaders (batch size 64) over the train/test tensors.
training_dataloader = DataLoader(
    CustomDataset(
        y1 = y1_train,
        y2 = y2_train,
        y3 = y3_train,
        xs = xs_train
    ),
    batch_size = 64,
    shuffle = True)
# NOTE(review): shuffle=True on the test loader is harmless here because the
# loss is averaged over batches, but it looks unintentional — confirm.
testing_dataloader = DataLoader(
    CustomDataset(
        y1 = y1_test,
        y2 = y2_test,
        y3 = y3_test,
        xs = xs_test
    ),
    batch_size = 64,
    shuffle = True)
xs.shape
(4699, 1106)
# --- y1 (Anthesis) ---------------------------------------------------------
# Single-trait model for standardized days-to-anthesis. (The notebook export
# fused the markdown heading onto the class line, which was a syntax error;
# it is restored here as a comment.)
class NeuralNetwork(nn.Module):
    """One-hidden-layer MLP: 1106 markers -> 64 hidden units -> 1 phenotype."""
    def __init__(self):
        super(NeuralNetwork, self).__init__()
        self.x_network = nn.Sequential(
            nn.Linear(1106, 64),
            nn.BatchNorm1d(64),
            nn.ReLU(),
            nn.Linear(64, 1))

    def forward(self, x):
        """Return a (batch, 1) prediction for a (batch, 1106) input."""
        x_out = self.x_network(x)
        return x_out
# Instantiate the single-trait model and sanity-check it on one batch.
model = NeuralNetwork().to(device)
# print(model)
xs_i, y1_i, y2_i, y3_i = next(iter(training_dataloader))
model(xs_i).shape # try prediction on one batch
torch.Size([64, 1])
def train_loop(dataloader, model, loss_fn, optimizer, silent = False):
    """Run one epoch of SGD updates against the first trait (anthesis)."""
    size = len(dataloader.dataset)
    for batch, (xs_i, y1_i, y2_i, y3_i) in enumerate(dataloader):
        # Forward pass: only y1 is used as the target in this version.
        loss = loss_fn(model(xs_i), y1_i)

        # Backward pass and parameter update.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Periodic progress report.
        if batch % 100 == 0 and not silent:
            seen = batch * len(y1_i)
            print(f"loss: {loss.item():>7f} [{seen:>5d}/{size:>5d}]")
def train_error(dataloader, model, loss_fn, silent = False):
    """Return the average per-batch training loss (first trait, y1) without
    updating weights.

    BUG FIX: the model is switched to eval() for the measurement and the
    previous mode restored afterwards. In the original the model stayed in
    training mode, so BatchNorm layers both evaluated with batch statistics
    and — even under no_grad — kept mutating their running statistics.
    """
    num_batches = len(dataloader)
    was_training = model.training
    model.eval()
    train_loss = 0
    with torch.no_grad():
        for xs_i, y1_i, y2_i, y3_i in dataloader:
            pred = model(xs_i)
            train_loss += loss_fn(pred, y1_i).item()
    model.train(was_training)
    train_loss /= num_batches
    return train_loss
def test_loop(dataloader, model, loss_fn, silent = False):
size = len(dataloader.dataset)
num_batches = len(dataloader)
test_loss = 0
with torch.no_grad():
for xs_i, y1_i, y2_i, y3_i in dataloader:
pred = model(xs_i)
test_loss += loss_fn(pred, y1_i).item() # <-----------------------
test_loss /= num_batches
if not silent:
print(f"Test Error: Avg loss: {test_loss:>8f}")
return(test_loss)
def train_nn(
    training_dataloader,
    testing_dataloader,
    model,
    learning_rate = 1e-3,
    batch_size = 64,
    epochs = 500
):
    """Train *model* with SGD/MSE for *epochs* epochs.

    Returns [model, loss_df] where loss_df holds the per-epoch train and
    test MSE (columns 'Epoch', 'TrainMSE', 'TestMSE').
    """
    loss_fn = nn.MSELoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

    # Pre-allocate the per-epoch loss table.
    loss_df = pd.DataFrame({'Epoch': list(range(epochs))})
    loss_df['TrainMSE'] = np.nan
    loss_df['TestMSE'] = np.nan

    for t in tqdm.tqdm(range(epochs)):
        train_loop(training_dataloader, model, loss_fn, optimizer, silent = True)
        loss_df.loc[t, 'TrainMSE'] = train_error(training_dataloader, model, loss_fn, silent = True)
        loss_df.loc[t, 'TestMSE'] = test_loop(testing_dataloader, model, loss_fn, silent = True)
    return [model, loss_df]
# model, loss_df = train_nn(
# training_dataloader,
# testing_dataloader,
# model,
# learning_rate = 1e-3,
# batch_size = 64,
# epochs = 500
# )
# fig = go.Figure()
# fig.add_trace(go.Scatter(x=loss_df.Epoch, y=loss_df.TrainMSE,
# mode='lines', name='Train'))
# fig.add_trace(go.Scatter(x=loss_df.Epoch, y=loss_df.TestMSE,
# mode='lines', name='Test'))
# fig.show()
# ! conda install captum -c pytorch -y
# # imports from captum library
# from captum.attr import LayerConductance, LayerActivation, LayerIntegratedGradients
# from captum.attr import IntegratedGradients, DeepLift, GradientShap, NoiseTunnel, FeatureAblation
# ig = IntegratedGradients(model)
# ig_nt = NoiseTunnel(ig)
# dl = DeepLift(model)
# gs = GradientShap(model)
# fa = FeatureAblation(model)
# ig_attr_test = ig.attribute(xs_test, n_steps=50)
# ig_nt_attr_test = ig_nt.attribute(xs_test)
# dl_attr_test = dl.attribute(xs_test)
# gs_attr_test = gs.attribute(xs_test, xs_train)
# fa_attr_test = fa.attribute(xs_test)
# [e.shape for e in [ig_attr_test,
# ig_nt_attr_test,
# dl_attr_test,
# gs_attr_test,
# fa_attr_test]]
# fig = go.Figure()
# fig.add_trace(go.Scatter(x = np.linspace(0, 1106-1, 1106),
# y = ig_nt_attr_test.cpu().detach().numpy().mean(axis=0),
# mode='lines', name='Test'))
# fig.add_trace(go.Scatter(x = np.linspace(0, 1106-1, 1106),
# y = dl_attr_test.cpu().detach().numpy().mean(axis=0),
# mode='lines', name='Test'))
# fig.add_trace(go.Scatter(x = np.linspace(0, 1106-1, 1106),
# y = gs_attr_test.cpu().detach().numpy().mean(axis=0),
# mode='lines', name='Test'))
# fig.add_trace(go.Scatter(x = np.linspace(0, 1106-1, 1106),
# y = fa_attr_test.cpu().detach().numpy().mean(axis=0),
# mode='lines', name='Test'))
# fig.show()
# len(dl_attr_test.cpu().detach().numpy().mean(axis = 0))
## y1 (Anthesis), y2 (Silking), and y3 (ASI)

Here each model predicts 3 values. The loss function is still MSE, but the y tensors are concatenated column-wise into a (batch, 3) target.
class NeuralNetwork(nn.Module):
    """One-hidden-layer MLP predicting all three traits: 1106 -> 64 -> 3."""
    def __init__(self):
        super(NeuralNetwork, self).__init__()
        layers = [
            nn.Linear(1106, 64),
            nn.BatchNorm1d(64),
            nn.ReLU(),
            nn.Linear(64, 3),
        ]
        self.x_network = nn.Sequential(*layers)

    def forward(self, x):
        """Map a (batch, 1106) marker tensor to (batch, 3) predictions."""
        return self.x_network(x)
# Instantiate the three-trait model and sanity-check it on one batch.
model = NeuralNetwork().to(device)
# print(model)
xs_i, y1_i, y2_i, y3_i = next(iter(training_dataloader))
model(xs_i).shape # try prediction on one batch
torch.Size([64, 3])
def train_loop(dataloader, model, loss_fn, optimizer, silent = False):
    """Run one epoch of SGD updates; the loss is MSE against all three
    traits stacked column-wise into a (batch, 3) target."""
    size = len(dataloader.dataset)
    for batch, (xs_i, y1_i, y2_i, y3_i) in enumerate(dataloader):
        # Stack the three targets to match the model's 3-column output.
        targets = torch.concat([y1_i, y2_i, y3_i], axis = 1)
        loss = loss_fn(model(xs_i), targets)

        # Backward pass and parameter update.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Periodic progress report.
        if batch % 100 == 0 and not silent:
            seen = batch * len(y1_i)
            print(f"loss: {loss.item():>7f} [{seen:>5d}/{size:>5d}]")
def train_error(dataloader, model, loss_fn, silent = False):
    """Return the average per-batch training loss against the (batch, 3)
    concatenated targets, without updating weights.

    BUG FIX: the model is switched to eval() for the measurement and the
    previous mode restored afterwards. In the original the model stayed in
    training mode, so BatchNorm layers both evaluated with batch statistics
    and — even under no_grad — kept mutating their running statistics.
    """
    num_batches = len(dataloader)
    was_training = model.training
    model.eval()
    train_loss = 0
    with torch.no_grad():
        for xs_i, y1_i, y2_i, y3_i in dataloader:
            pred = model(xs_i)
            train_loss += loss_fn(pred, torch.concat([y1_i, y2_i, y3_i], axis = 1)).item()
    model.train(was_training)
    train_loss /= num_batches
    return train_loss
def test_loop(dataloader, model, loss_fn, silent = False):
size = len(dataloader.dataset)
num_batches = len(dataloader)
test_loss = 0
with torch.no_grad():
for xs_i, y1_i, y2_i, y3_i in dataloader:
pred = model(xs_i)
test_loss += loss_fn(pred, torch.concat([y1_i, y2_i, y3_i], axis = 1)).item() # <-----------------------
test_loss /= num_batches
if not silent:
print(f"Test Error: Avg loss: {test_loss:>8f}")
return(test_loss)
def train_nn(
    training_dataloader,
    testing_dataloader,
    model,
    learning_rate = 1e-3,
    batch_size = 64,
    epochs = 500
):
    """Train the three-trait *model* with SGD/MSE for *epochs* epochs.

    Returns [model, loss_df] where loss_df holds the per-epoch train and
    test MSE (columns 'Epoch', 'TrainMSE', 'TestMSE').
    """
    loss_fn = nn.MSELoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

    # Per-epoch loss table, pre-filled with NaN.
    loss_df = pd.DataFrame({
        'Epoch': range(epochs),
        'TrainMSE': np.nan,
        'TestMSE': np.nan,
    })

    for t in tqdm.tqdm(range(epochs)):
        train_loop(training_dataloader, model, loss_fn, optimizer, silent = True)
        loss_df.loc[t, 'TrainMSE'] = train_error(training_dataloader, model, loss_fn, silent = True)
        loss_df.loc[t, 'TestMSE'] = test_loop(testing_dataloader, model, loss_fn, silent = True)
    return [model, loss_df]
# model, loss_df = train_nn(
# training_dataloader,
# testing_dataloader,
# model,
# learning_rate = 1e-3,
# batch_size = 64,
# epochs = 500
# )
# fig = go.Figure()
# fig.add_trace(go.Scatter(x=loss_df.Epoch, y=loss_df.TrainMSE,
# mode='lines', name='Train'))
# fig.add_trace(go.Scatter(x=loss_df.Epoch, y=loss_df.TestMSE,
# mode='lines', name='Test'))
# fig.show()
# model, loss_df = train_nn(
# training_dataloader,
# testing_dataloader,
# model,
# learning_rate = 1e-3,
# batch_size = 64,
# epochs = 5000
# )
# fig = go.Figure()
# fig.add_trace(go.Scatter(x=loss_df.Epoch, y=loss_df.TrainMSE,
# mode='lines', name='Train'))
# fig.add_trace(go.Scatter(x=loss_df.Epoch, y=loss_df.TestMSE,
# mode='lines', name='Test'))
# fig.show()
'../ext_data/zma/panzea/phenotypes/'
'../ext_data/zma/panzea/phenotypes/'
# pd.read_table('../ext_data/zma/panzea/phenotypes/traitMatrix_maize282NAM_v15-130212.txt', low_memory = False)
# pd.read_excel('../ext_data/zma/panzea/phenotypes/traitMatrix_maize282NAM_v15-130212_TraitDescritptions.xlsx')